Still under construction.
(III) Detailed List
- Read and load every line of the page source for each of the 250 movies.
- Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
- The data was collected on 2020-10-29.
# Read the page source behind one movie link.
#
# Args:
#   curr_movie_link: passed straight to readLines() as its connection.
# Returns:
#   Whatever readLines() returns for it: a character vector with one element
#   per line, read with encoding = "UTF-8".
h_get_movie_source_code <- function(curr_movie_link) {
  readLines(curr_movie_link, encoding = "UTF-8")
}
# Extract the poster block from one movie page.
#
# The previous body stopped after locating the start line and then evaluated
# an undefined name, so every call failed. This version finishes the search
# the same way the box-office extractor does.
#
# Args:
#   movie_source_code: character vector, one element per line of page source.
# Returns:
#   The lines from the first `<div class="poster">` line through the first
#   line at or after it that matches `</a> </div>`; character(0) when either
#   marker is not found.
#
# NOTE(review): assumes the closing `</a> </div>` sits on a single source line
# with exactly one space between the tags - confirm against a real page.
get_poster_from_movie_source_code <- function(movie_source_code) {
  poster_start_pattern <- "<div class=\"poster\">"
  poster_end_pattern <- "</a> </div>"

  poster_start_lines <- grep(poster_start_pattern, movie_source_code)
  if (length(poster_start_lines) == 0) {
    return(character(0))
  }
  poster_start_line <- poster_start_lines[1]

  # Only an end marker at or after the start line can close the poster block.
  poster_end_lines <- grep(poster_end_pattern, movie_source_code)
  poster_end_lines <- poster_end_lines[poster_end_lines >= poster_start_line]
  if (length(poster_end_lines) == 0) {
    return(character(0))
  }

  movie_source_code[poster_start_line:poster_end_lines[1]]
}
# Extract the JSON-LD "basic info" block from one movie page.
#
# Args:
#   movie_source_code: character vector, one element per line of page source.
# Returns:
#   The lines from the first line that opens the
#   `<script type="application/ld+json">{` element through the first line at
#   or after it that contains `}</script>`; character(0) when either marker
#   is not found.
h_get_basics_from_movie_source_code <- function(movie_source_code) {
  json_start_pattern <- "<script type=\"application/ld\\+json\">\\{"
  json_end_pattern <- "\\}</script>"

  json_start_lines <- grep(json_start_pattern, movie_source_code)
  if (length(json_start_lines) == 0) {
    return(character(0))
  }
  json_start_line <- json_start_lines[1]

  # Any earlier inline script may also end in "}</script>", so the end marker
  # must be searched for from the start line onwards, not from the page top.
  json_end_lines <- grep(json_end_pattern, movie_source_code)
  json_end_lines <- json_end_lines[json_end_lines >= json_start_line]
  if (length(json_end_lines) == 0) {
    return(character(0))
  }

  movie_source_code[json_start_line:json_end_lines[1]]
}
# Extract the "Box Office" section from one movie page.
#
# Args:
#   movie_source_code: character vector, one element per line of page source.
# Returns:
#   The lines from the first `<h3 class="subheading">Box Office</h3>` line
#   through the first `<hr />` line strictly after it; character(0) when the
#   page has no Box Office heading or no `<hr />` follows it (previously both
#   cases raised an error).
h_get_box_office_from_movie_source_code <- function(movie_source_code) {
  box_office_start_pattern <- "<h3 class=\"subheading\">Box Office</h3>"
  box_office_end_pattern <- "<hr />"

  box_office_start_lines <- grep(box_office_start_pattern, movie_source_code)
  if (length(box_office_start_lines) == 0) {
    return(character(0))
  }
  box_office_start_line <- box_office_start_lines[1]

  # `<hr />` occurs throughout the page; only one below the heading can
  # terminate this section.
  box_office_end_lines <- grep(box_office_end_pattern, movie_source_code)
  box_office_end_lines <-
    box_office_end_lines[box_office_end_lines > box_office_start_line]
  if (length(box_office_end_lines) == 0) {
    return(character(0))
  }

  movie_source_code[box_office_start_line:box_office_end_lines[1]]
}
# Placeholder: takes `basics`, does nothing with it yet, and returns NULL.
h_get_basics_info <- function(basics) NULL
# Try the helpers on the first link: download the page once, then cut the
# basics (JSON) block and the box-office block out of the same source.
# NOTE(review): the link vector is called `m_link` here but `movie_link`
# further down - confirm `m_link` is defined.
curr_source_code <- h_get_movie_source_code(m_link[1])
curr_basics <- h_get_basics_from_movie_source_code(curr_source_code)
curr_box_office <- h_get_box_office_from_movie_source_code(curr_source_code)
# Print the extracted basics lines (cat() separates them with spaces).
cat(curr_basics)
<script type="application/ld+json">{ "@context": "http://schema.org", "@type": "Movie", "url": "/title/tt0111161/", "name": "The Shawshank Redemption", "image": "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg", "genre": "Drama", "contentRating": "R", "actor": [ { "@type": "Person", "url": "/name/nm0000209/", "name": "Tim Robbins" }, { "@type": "Person", "url": "/name/nm0000151/", "name": "Morgan Freeman" }, { "@type": "Person", "url": "/name/nm0348409/", "name": "Bob Gunton" }, { "@type": "Person", "url": "/name/nm0006669/", "name": "William Sadler" } ], "director": { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, "creator": [ { "@type": "Person", "url": "/name/nm0000175/", "name": "Stephen King" }, { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, { "@type": "Organization", "url": "/company/co0040620/" } ], "description": "The Shawshank Redemption is a movie starring Tim Robbins, Morgan Freeman, and Bob Gunton. Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "datePublished": "1994-09-23", "keywords": "wrongful imprisonment,based on the works of stephen king,prison,escape from prison,voice over narration", "aggregateRating": { "@type": "AggregateRating", "ratingCount": 2299184, "bestRating": "10.0", "worstRating": "1.0", "ratingValue": "9.3" }, "review": { "@type": "Review", "itemReviewed": { "@type": "CreativeWork", "url": "/title/tt0111161/" }, "author": { "@type": "Person", "name": "carflo" }, "dateCreated": "2003-11-26", "inLanguage": "English", "name": "Tied for the best movie I have ever seen", "reviewBody": "Why do I want to write the 234th comment on The Shawshank Redemption? I am not sure - almost everything that could be possibly said about it has been said. 
But like so many other people who wrote comments, I was and am profoundly moved by this simple and eloquent depiction of hope and friendship and redemption. \n\nThe only other movie I have ever seen that effects me as strongly is To Kill a Mockingbird. Both movies leave me feeling cleaner for having watched them.\n\nI didn\u0027t intend to see this movie at all: I do not like prison movies and I don\u0027t normally watch them. I work at a branch library and one day as I was checking The Shawshank Redemption out to one of our older patrons, she said to me, \"Whenever I feel down or depressed, I check out this movie and watch it and it always makes me feel better.\" At the time, I thought that was very strange. One day there was nothing on TV except things I absolutely would not watch under any circumstance or things that I had seen too many times already. I remembered what she said, so I watched it. I have watched it many many times since then and it gets better with every showing.\n\nNo action, no special effects - just men in prison uniforms talking to each other.\n\nThe Shawshank Redemption and To Kill a Mockingbird are the best movies I have ever seen. I do not judge it by it\u0027s technical merits - I don\u0027t really care about that. I have read that Citizen Kane or The Godfather or this or that movie is the best movie ever made. They may have the best technique or be the most influential motion pictures ever made, but not the best. The best movies are ones that touch the soul. 
It takes a movie like The Shawshank Redemption to touch the soul.", "reviewRating": { "@type": "Rating", "worstRating": "1", "bestRating": "10", "ratingValue": "10" } }, "duration": "PT2H22M", "trailer": { "@type": "VideoObject", "name": "Official Trailer", "embedUrl": "/video/imdb/vi3877612057", "thumbnail": { "@type": "ImageObject", "contentUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg" }, "thumbnailUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg", "description": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "uploadDate": "2014-03-05T14:13:19Z" } }</script>
# Print the extracted box-office lines (cat() separates them with spaces).
cat(curr_box_office)
<h3 class="subheading">Box Office</h3> <div class="txt-block"> <h4 class="inline">Budget:</h4>$25,000,000 <span class="attribute">(estimated)</span> </div> <div class="txt-block"> <h4 class="inline">Opening Weekend USA:</h4> $727,327, <span class="attribute">25 September 1994</span> </div> <div class="txt-block"> <h4 class="inline">Gross USA:</h4> $28,699,976 </div> <div class="txt-block"> <h4 class="inline">Cumulative Worldwide Gross:</h4> $28,815,291 </div> <span class="see-more inline"> <a href="https://pro.imdb.com/title/tt0111161?rf=cons_tt_bo_tt&ref_=cons_tt_bo_tt" >See more on IMDbPro</a> » </span> <hr />
| Field | Pattern used to locate it in the page source |
|---|---|
| Title | `h1 itemprop="name"` |
| Year | Next line of Title |
| Content Rating | `meta itemprop="contentRating"` |
| User Rating | `span itemprop="ratingValue"` |
| Number of Rater | `itemprop="ratingCount"` |
| Genre | `span class="itemprop" itemprop="genre"` |
| Budget | `<h4 class="inline">Budget` |
| Opening Weekend USA ($) | `<h4 class="inline">Opening Weekend USA` |
| Gross USA ($) | `<h4 class="inline">Gross` |
| Cumulative Worldwide Gross ($) | `<h4 class="inline">Cumulative` |
# Scrape the ten target fields from a single movie page.
#
# Args:
#   input: one page link from `movie_link`; it is handed to readLines() as
#     `con`, so a local file path works as well.
# Returns:
#   A character vector of exactly 10 elements, in this order:
#     1 title, 2 year, 3 content rating, 4 user rating, 5 number of raters,
#     6 genres joined by ", ", 7 budget, 8 opening weekend USA, 9 gross USA,
#     10 cumulative worldwide gross.
#   A field whose marker line is absent from the page is reported as "-".
#   When a marker matches several lines, the first match is used, so the
#   result always keeps its 10 positions.
get.target.info <- function(input) {
  page <- readLines(con = input, encoding = "UTF-8")
  missing_value <- "-"

  # First page line matching `pattern`, shifted down by `offset` lines;
  # character(0) when the pattern does not occur.
  first_line <- function(pattern, offset = 0) {
    hits <- grep(pattern, page)
    if (length(hits) == 0) {
      return(character(0))
    }
    page[hits[1] + offset]
  }

  # k-th piece of `line` after splitting on ">" (NA when there is none).
  piece <- function(line, k) {
    strsplit(line, split = ">")[[1]][k]
  }

  # Text of the k-th ">"-piece up to its first "<", i.e. the visible text
  # that follows the (k-1)-th tag on the line.
  tag_text <- function(line, k) {
    strsplit(piece(line, k), split = "<")[[1]][1]
  }

  # Apply `extract` to the line if one was found, else report it as missing.
  field <- function(line, extract) {
    if (length(line) == 0) {
      return(missing_value)
    }
    extract(line)
  }

  # Budgets quoted with one of these ISO-style prefixes keep only the first
  # two space-separated tokens (code and amount).
  currency_codes <- c("FRF", "JPY", "INR", "DEM", "RUR", "TRL", "AUD", "KRW")
  # Euro / pound prefixes and the code that replaces them.
  # NOTE(review): the earlier version compared the first 6 / 7 characters and
  # resumed at position 7 / 8, which matches the HTML entities rather than the
  # one-character symbols; both spellings are accepted here.
  symbol_prefixes <- c("&euro;", "\u20ac", "&pound;", "\u00a3")
  symbol_codes <- c("EUR", "EUR", "GBP", "GBP")

  normalise_budget <- function(budget) {
    if (is.na(budget)) {
      return(budget)
    }
    if (substr(budget, 1, 3) %in% currency_codes) {
      tokens <- strsplit(budget, split = " ")[[1]]
      return(paste(tokens[1], tokens[2], sep = " "))
    }
    hit <- which(startsWith(budget, symbol_prefixes))
    if (length(hit) > 0) {
      prefix <- symbol_prefixes[hit[1]]
      amount <- substr(budget, nchar(prefix) + 1, nchar(budget))
      return(paste(symbol_codes[hit[1]], amount))
    }
    budget
  }

  # Second space-separated token of the text after the <h4> label, with a
  # single trailing comma removed ("$727,327," -> "$727,327").
  money_field <- function(pattern) {
    field(first_line(pattern), function(line) {
      amount <- strsplit(piece(line, 3), split = " ")[[1]][2]
      if (is.na(amount)) {
        return(missing_value)
      }
      sub(",$", "", amount)
    })
  }

  #1. title---- (text after the <h1> tag, cut at the first "&")
  movie_title <- field(
    first_line("h1 itemprop=\"name\""),
    function(line) strsplit(piece(line, 2), split = "&")[[1]][1]
  )

  #2. year---- (taken from the line right below the title line)
  movie_year <- field(
    first_line("h1 itemprop=\"name\"", offset = 1),
    function(line) tag_text(line, 2)
  )

  #3. content rating----
  movie_content_rating <- field(
    first_line("meta itemprop=\"contentRating\""),
    function(line) piece(line, 2)
  )

  #4. user rating----
  movie_user_rating <- field(
    first_line("span itemprop=\"ratingValue\""),
    function(line) tag_text(line, 3)
  )

  #5. number of rater----
  movie_num_rater <- field(
    first_line("itemprop=\"ratingCount\""),
    function(line) tag_text(line, 3)
  )

  #6. genre---- (every matching line contributes one genre)
  genre_lines <- grep(
    "span class=\"itemprop\" itemprop=\"genre\"", page,
    value = TRUE
  )
  if (length(genre_lines) == 0) {
    movie_genre <- missing_value
  } else {
    genres <- vapply(
      genre_lines, tag_text, character(1),
      k = 3, USE.NAMES = FALSE
    )
    movie_genre <- paste(genres, collapse = ", ")
  }

  #7. budget----
  movie_budget <- field(
    first_line("<h4 class=\"inline\">Budget"),
    function(line) normalise_budget(piece(line, 3))
  )

  #8. opening----
  movie_opening <- money_field("<h4 class=\"inline\">Opening Weekend USA")

  #9. gross----
  movie_gross <- money_field("<h4 class=\"inline\">Gross")

  #10. worldwide gross----
  movie_worldwide_gross <- money_field("<h4 class=\"inline\">Cumulative")

  #11. result----
  c(
    movie_title, movie_year, movie_content_rating, movie_user_rating,
    movie_num_rater, movie_genre, movie_budget, movie_opening,
    movie_gross, movie_worldwide_gross
  )
}
#Collecting data----
# Scrape each of the first `n_movies` links exactly once, then split the
# per-movie results into one character vector per field. Building the list in
# one pass replaces ten vectors that were each re-grown with c() on every
# iteration; the resulting vectors are the same.
n_movies <- 250
target_info <- lapply(movie_link[seq_len(n_movies)], get.target.info)

# k-th field of every scraped movie, as an unnamed character vector.
pick_field <- function(k) {
  vapply(target_info, function(info) info[k], character(1), USE.NAMES = FALSE)
}

movie_title <- pick_field(1)
movie_year <- pick_field(2)
movie_content_rating <- pick_field(3)
movie_user_rating <- pick_field(4)
movie_num_rater <- pick_field(5)
movie_genre <- pick_field(6)
movie_budget <- pick_field(7)
movie_opening <- pick_field(8)
movie_gross <- pick_field(9)
movie_worldwide_gross <- pick_field(10)
#Visualization----
library(knitr)
# Gather the rank and the ten scraped fields into one table.
y <- data.frame(
  movie_rank, movie_title, movie_year, movie_content_rating,
  movie_user_rating, movie_num_rater, movie_genre, movie_budget,
  movie_opening, movie_gross, movie_worldwide_gross,
  stringsAsFactors = FALSE
)
# Every column is displayed as text, so convert them all in one step.
y[] <- lapply(y, as.character)
display_names <- c(
  "Rank", "Title", "Year", "Content Rating", "User Rating",
  "Number of Rater", "Genre", "Budget", "Opening Weekend USA",
  "Gross USA", "Cumulative Worldwide Gross"
)
kable(y, align = "c", col.names = display_names)